In [1]:
'''
File name: project.ipynb
Date created: 03/11/2019
Date last modified: 20/12/2019
Python Version: 3.7.4
''';
In [2]:
import pandas as pd
import numpy as np
import geopandas as gpd

import plotly
import plotly.express as px
import plotly.graph_objs as go

from utils import constants as cst
from utils import clean_database
from utils import areas_handler
from utils import chicago_map_handler as maps

# Set auto-reload 
%load_ext autoreload
%autoreload 2

Load Databases

In this section we load and clean the databases. All methods used are described in utils/clean_database.

Table of Contents

In [3]:
# Load the areas dataframe 
areas_DF = gpd.read_file(cst.AREAS_PATH)

# Clean the dataframe
areas_DF = clean_database.clean_areas_df(areas_DF)
areas_DF.head()
Out[3]:
community_area_num community_area_name shape_area shape_len geometry
0 35 douglas 4.600462e+07 31027.054510 POLYGON ((-87.60914 41.84469, -87.60915 41.844...
1 36 oakland 1.691396e+07 19565.506153 POLYGON ((-87.59215 41.81693, -87.59231 41.816...
2 37 fuller park 1.991670e+07 25339.089750 POLYGON ((-87.62880 41.80189, -87.62879 41.801...
3 38 grand boulevard 4.849250e+07 28196.837157 POLYGON ((-87.60671 41.81681, -87.60670 41.816...
4 39 kenwood 2.907174e+07 23325.167906 POLYGON ((-87.59215 41.81693, -87.59215 41.816...
In [4]:
# Load the food inspections dataframe
food_inspections_DF = pd.read_csv(cst.FOOD_INSPECTIONS_PATH, sep = ',', header = 0, 
                   names = cst.FOOD_INSPECTIONS_COL_NAMES, index_col = None, error_bad_lines=False
                   )

# Clean the dataframe
food_inspections_DF = clean_database.clean_food_inspections_df(food_inspections_DF, areas_DF)
food_inspections_DF.head()
Out[4]:
inspection_id DBA_name AKA_name license_num facility_type risk address city state zip inspection_date inspection_type result violations lat lng location
0 2345323 ARMAND'S PIZZERIA ARMAND'S PIZZERIA 2698587.0 Restaurant Risk 1 (High) 29 N WACKER DR chicago IL 60606.0 2019-11-08 License Pass w/ Conditions 3. MANAGEMENT, FOOD EMPLOYEE AND CONDITIONAL E... 41.882700 -87.636638 {'latitude': '-87.63663755997726', 'longitude'...
1 2345321 GOPUFF GOPUFF 2684560.0 Grocery Store Risk 3 (Low) 1801 W WARNER AVE chicago IL 60613.0 2019-11-08 License Re-Inspection Pass NaN 41.956846 -87.674395 {'latitude': '-87.6743946694658', 'longitude':...
2 2345325 TACO MAX MEXICAN GRILL TACO MAX MEXICAN GRILL 2699082.0 Restaurant Risk 1 (High) 3402 W MONTROSE AVE chicago IL 60618.0 2019-11-08 License Pass w/ Conditions 3. MANAGEMENT, FOOD EMPLOYEE AND CONDITIONAL E... 41.961238 -87.713284 {'latitude': '-87.71328438033805', 'longitude'...
3 2345370 CAFE BALLOU CAFE BALLOU & DELI 2433048.0 Restaurant Risk 1 (High) 939 N WESTERN AVE chicago IL 60622.0 2019-11-08 Canvass No Entry NaN 41.898706 -87.686773 {'latitude': '-87.68677251748062', 'longitude'...
4 2345376 GARIBAY POULTRY GARIBAY POULTRY 1908500.0 CUSTOM POULTRY SLAUGHTER Risk 2 (Medium) 2100 S CALIFORNIA AVE chicago IL 60608.0 2019-11-08 Complaint Pass 47. FOOD & NON-FOOD CONTACT SURFACES CLEANABLE... 41.853688 -87.695652 {'latitude': '-87.69565174882821', 'longitude'...
In [5]:
# Load the socio-economic indicators dataframe
socio_economic_DF = pd.read_csv(cst.SOCIO_ECONOMIC_INDICATORS_PATH, sep = ',', header = 0, 
                   names = cst.SOCIO_ECONOMIC_COL_NAMES, index_col = None, error_bad_lines=False
                   )

# Clean the dataframe
socio_economic_DF = clean_database.clean_socio_economic_df(socio_economic_DF)
socio_economic_DF.head()
Out[5]:
community_area_num community_area_name housing_crowded_perc housholds_below_poverty_perc aged_16_or_more_unemployed_perc aged_25_or_more_without_high_school_diploma_perc aged_under_18_or_over_64_perc per_capita_income hardship_idx
0 1 rogers park 7.7 23.6 8.7 18.2 27.5 23939 39.0
1 2 west ridge 7.8 17.2 8.8 20.8 38.5 23040 46.0
2 3 uptown 3.8 24.0 8.9 11.8 22.2 35787 20.0
3 4 lincoln square 3.4 10.9 8.2 13.4 25.5 37524 17.0
4 5 north center 0.3 7.5 5.2 4.5 26.2 57123 6.0
In [6]:
# Load the life expectancy dataframe
life_expectancy_DF = pd.read_csv(cst.LIFE_EXPECTANCY_PATH, sep = ',', header = 0, 
                   names = cst.LIFE_EXPECTANCY_COL_NAMES, index_col = None, error_bad_lines=False
                   )

# Clean the dataframe

life_expectancy_DF = clean_database.clean_socio_economic_df(life_expectancy_DF)
life_expectancy_DF.head()
Out[6]:
community_area_num community_area_name life_exp_1990 lower_95_perc_CI_1990 upper_95_perc_CI_1990 life_exp_2000 lower_95_perc_CI_2000 upper_95_perc_CI_2000 life_exp_2010 lower_95_perc_CI_2010 upper_95_perc_CI_2010
0 1 rogers park 70.9 69.9 71.9 73.1 72.2 74.1 77.3 76.3 78.2
1 2 west ridge 76.9 76.1 77.8 78.1 77.3 78.8 80.3 79.5 81.1
2 3 uptown 64.0 63.1 64.9 71.7 70.8 72.7 76.0 75.1 76.9
3 4 lincoln square 74.2 73.1 75.4 76.8 75.8 77.8 80.5 79.3 81.6
4 5 north center 73.4 72.1 74.7 77.9 76.6 79.1 81.5 80.1 82.8

Complete Datasets

A few issues :

  1. We only have the area name for the life expectancy and the socio-economic dataframes. So one solution to be able to compare the different dataframes would be to add a column containing the area of a food inspection location. Since our analysis will be done on the different areas, this seems like the simplest approach to work around the problem.
  2. Some entries in food inspections DF have no lat/lng pair, but they do have an address. We find the lag/long given the address.

The solutions described are implemented in utils/areas_handler.

Table of Contents

In [7]:
# Filter locations where lng/lat are unknown
food_unknown_loc = food_inspections_DF[food_inspections_DF['lat'].isna()]
In [8]:
# Get unknown locations
unknown_locations = areas_handler.get_unknown_locations(food_unknown_loc)
In [9]:
# Check the locations not found by OpenStreetMaps
unknown_locations[pd.isnull(unknown_locations['lat'])].head()
Out[9]:
address lat lng
9 2011 N GRIFFIN BLVD NaN NaN
10 65 CARMINE ST NaN NaN
11 950 ESTES CT NaN NaN
12 108 W PARK NaN NaN
13 1307 CARDINAL DR NaN NaN
In [10]:
# Display the Chicago areas
# This map contains additional layers to visually check if the found locations are actually within the borders of the city
map_chicago = maps.create_chicago_map(with_community_areas=True)
map_chicago
Out[10]:
In [11]:
# Save map
map_chicago.save(outfile=cst.MAPS_PATH+"map_intro.html")

Description of the map

On the map above we can see the city of Chicago and the region where the facilities of the inspections presented in the food_inspections dataset are located. In the following, we will focus ang get some insights on that area.

In [12]:
# Use unknonwn_locations to fill lat and lng in the original dataframe food_inspections_DF
food_unknown_loc = food_unknown_loc.reset_index().merge(unknown_locations, on="address", how='left') \
                                   .set_index('index').drop(['lat_x', 'lng_x'], axis = 1) \
                                   .rename(columns={'lat_y':'lat','lng_y':'lng'})
food_inspections_DF.update(food_unknown_loc)

# Check which latitudes are still unknown 
print('%d food inspections still missing lat long info, out of %d. '%(food_inspections_DF.lat.isna().sum(), len(food_inspections_DF)))
103 food inspections still missing lat long info, out of 195796. 

So we have no more unknown locations in our food inspections dataframe.

In [13]:
# Resolve are numbers and delete unknown areas
# Takes a while if not already saved
food_inspections_DF = areas_handler.get_locations_with_area(food_inspections_DF, areas_DF)
print("Number of locations: " + str(food_inspections_DF.shape[0]))

# Drop locations not in the city of chicago 
food_inspections_DF = food_inspections_DF.dropna(subset=[cst.AREA_NUM])
print("Number of locations in the city of chicago: " + str(food_inspections_DF.shape[0]))
food_inspections_DF[cst.AREA_NUM] = food_inspections_DF[cst.AREA_NUM].astype(int)
Number of locations: 195796
Number of locations in the city of chicago: 192821

Maps

We visualize some of the data using folium to find connections

Table of Contents

Number of inspections map

In [14]:
# Create new dataframe with number of inspections per area
inspection_counts = food_inspections_DF[cst.AREA_NUM].value_counts().to_frame()
inspection_counts.reset_index(level=0, inplace=True)
inspection_counts.columns=[cst.AREA_NUM,'num_inspections']
inspection_counts[cst.AREA_NUM] = inspection_counts[cst.AREA_NUM].astype(str)
inspection_counts.sort_values('num_inspections');
In [15]:
rename_mapping = {"num_inspections": "number of inspections"}
heatmap_chicago = maps.heat_map(inspection_counts, "Number of Inspections", cst.AREA_NUM, "num_inspections", \
                                rename_mapping=rename_mapping)
heatmap_chicago
Out[15]:
In [16]:
# Save map
heatmap_chicago.save(outfile=cst.MAPS_PATH+"num_inspections.html")

Description of the map

The heatmap above shows the number of inspections on the area we are focusing on and determined before. We can see that the more we approach the city center of Chicago, the higher the number of inspections.

However, it is also curious to see that in the north of Chicago, the number of inspections is higher than in the south. This could be due to a higher amount of establishment in this area; we this check below.

Number of Establishments

Here we determine and analyse the number of establishments per region area.

In [17]:
count_DF = food_inspections_DF.copy()
count_DF = count_DF.drop_duplicates(subset=['license_num'])
count_ser = count_DF[cst.AREA_NUM].value_counts().to_frame()
count_ser.reset_index(level=0, inplace=True)
count_ser.columns=[cst.AREA_NUM,'num_establishments']
count_ser[cst.AREA_NUM] = count_ser[cst.AREA_NUM].astype(str)
count_ser.sort_values('num_establishments');
In [18]:
rename_mapping = {"num_establishments": "number of establishments"}
heatmap_chicago = maps.heat_map(count_ser, "Number of Establishments", cst.AREA_NUM, 'num_establishments', \
                                rename_mapping=rename_mapping)
heatmap_chicago
Out[18]:
In [19]:
# Save map
heatmap_chicago.save(outfile=cst.MAPS_PATH+"num_establishments.html")

Description of the map

The heatmap above shows the number of establishments on the area we want to analyze. Same as before, we observe that the more we approach the city center or the north of the city, the higher the number of establishments. Indeed, the number of inspections is correlated with the number of establishments; the more establishments we have on a given area, the higher the inspections will be on that area.

Average risk per inspection for each area

We compute a "risk_ser" dataframe, which has two columns containing the area number and the risk level. To simplify the risk values, we convert the risk in the food inspections dataframe to an integer.

In [20]:
def risk_to_num(x):
    if x == 'Risk 3 (Low)':
        return 1
    if x == 'Risk 2 (Medium)':
        return 2
    if x == 'Risk 1 (High)':
        return 3
    if x == 'All':
        return 2
    else:
        return x

risk_DF = food_inspections_DF.copy()
risk_DF = risk_DF.dropna(subset=['risk'])
risk_DF['risk'] = risk_DF['risk'].apply(lambda x : risk_to_num(x)).astype(int)
risk_ser = risk_DF.groupby(cst.AREA_NUM)['risk'].mean().to_frame()
risk_ser.reset_index(level=0, inplace=True)
risk_ser[cst.AREA_NUM] = risk_ser[cst.AREA_NUM].astype(str)

rounding = 3
risk_ser['risk'] = risk_ser['risk'].apply(lambda x: round(x, rounding))
In [21]:
heatmap_chicago = maps.heat_map(risk_ser, "Average risk", cst.AREA_NUM, 'risk')
heatmap_chicago
Out[21]:
In [22]:
# Save map
heatmap_chicago.save(outfile=cst.MAPS_PATH+"average_risk.html")

Description of the map

The heatmap above shows the average risk of the establishments for each community area. The risk of an establishment shows how it could potentially affect the public's health with 1 being the highest and 3 the lowest. Furthermore, high risk establishments are inspected more frequently that low risk establishments.

Average result per inspection for each area

We compute a "result_ser" dataframe, which has two columns containing the area number and the average inspection result in that area. To simplify the result values, we convert the result in the food inspections dataframe to an integer between 0 (Inspection failed) and 2 (inspection passed).

In [23]:
food_inspections_DF.result
Out[23]:
0         Pass w/ Conditions
1                       Pass
2         Pass w/ Conditions
3                   No Entry
4                       Pass
                 ...        
195791                  Fail
195792                  Pass
195793                  Pass
195794                  Fail
195795                  Pass
Name: result, Length: 192821, dtype: object
In [24]:
def result_to_num(e):
        if e == 'Pass':
            return 2
        if e == 'Pass w/ Conditions':
            return 1
        if e == 'Fail':
             return 0


result_DF = food_inspections_DF.copy()
result_DF['result'] = result_DF['result'].apply(lambda x : result_to_num(x))
result_DF = result_DF.dropna(subset=['result'])
result_DF['result'] = result_DF['result'].astype(int)
result_ser = result_DF.groupby(cst.AREA_NUM)['result'].mean().to_frame()
result_ser.reset_index(level=0, inplace=True)
result_ser[cst.AREA_NUM] = result_ser[cst.AREA_NUM].astype(str)

rounding = 3
result_ser['result'] = result_ser['result'].apply(lambda x: round(x, rounding))
In [25]:
heatmap_chicago = maps.heat_map(result_ser, "Average result", cst.AREA_NUM, 'result', good_indicator=True)
heatmap_chicago
Out[25]:
In [26]:
# Save map
heatmap_chicago.save(outfile=cst.MAPS_PATH+"average_result.html")

Description of the map

The heatmap above shows the average risk of the establishments for each community area. The risk of an establishment shows how it could potentially affect the public's health with 1 being the highest and 3 the lowest. Furthermore, high risk establishments are inspected more frequently that low risk establishments.

Number of inspections per establishment

In [27]:
#inspections_per_est = inspection_counts.merge(count_ser, left_on=cst.AREA_NUM, right_on=cst.AREA_NUM).drop(['index'], axis = 1)
inspections_per_est = inspection_counts.merge(count_ser, left_on=cst.AREA_NUM, right_on=cst.AREA_NUM)
rounding = 3
inspections_per_est['inspections_per_establishment'] = inspections_per_est.apply(lambda x : round(x.num_inspections/x.num_establishments,rounding), axis=1)
In [28]:
rename_mapping = {'inspections_per_establishment':'inspections per establishment'}
heatmap_chicago = maps.heat_map(inspections_per_est, "Number of Inspections per Establishment", cst.AREA_NUM, \
                                'inspections_per_establishment', rename_mapping=rename_mapping)
heatmap_chicago
Out[28]:
In [29]:
# Save map
heatmap_chicago.save(outfile=cst.MAPS_PATH+"inspections_per_establishment.html")

Description of the map

The heatmap above shows the number of inspections per establishment and as highlighted before, we can see that the establishments with a high number of inspections present high average risks.

Violations

Each establishment can receive a violation number between 1-44 or 70 and the number is followed by a specific description of the findings that caused the violation to be issued. The higher the number, the better the description. Establishments collecting only high numbers might probably pass whereas the others might probably fail.

An inspection of an establishment can pass, pass with conditions or fail depending on these numbers. The 'pass' condition is given when the establishment were found to have no critical or serious violations. Establishments that received a 'pass with conditions' were found to have critical or serious violations but these violations were corrected during the inspection. Finally, the 'fail' condition is issued when the establishment were found to have critical or serious violations that were not correctable during the inspection.

We will analyse more deeply the violations for milestone 3.

Socioeconomic indicators

In [30]:
# Merge socio-economic and life expectancy df's on the area number and names
socio_life_merged_DF = socio_economic_DF.merge(life_expectancy_DF, how="left", on=["community_area_num", "community_area_name"])
In [31]:
# Select two of the following columns for the heatmap
socio_life_merged_DF[cst.AREA_NUM] = socio_life_merged_DF[cst.AREA_NUM].astype(str)
for c in socio_life_merged_DF.columns:
    print(c)
community_area_num
community_area_name
housing_crowded_perc
housholds_below_poverty_perc
aged_16_or_more_unemployed_perc
aged_25_or_more_without_high_school_diploma_perc
aged_under_18_or_over_64_perc
per_capita_income
hardship_idx
life_exp_1990
lower_95_perc_CI_1990
upper_95_perc_CI_1990
life_exp_2000
lower_95_perc_CI_2000
upper_95_perc_CI_2000
life_exp_2010
lower_95_perc_CI_2010
upper_95_perc_CI_2010
In [32]:
rename_mapping = {'per_capita_income':'per capita income ($)'}
heatmap_chicago = maps.heat_map(socio_life_merged_DF, "Per capita income 2010",cst.AREA_NUM ,'per_capita_income',\
                                good_indicator = True, rename_mapping=rename_mapping)
heatmap_chicago
Out[32]:
In [33]:
# Save map
heatmap_chicago.save(outfile=cst.MAPS_PATH+"per_capita_income.html")
In [34]:
rename_mapping = {'life_exp_2010':'life expectancy 2010 (years)'}
heatmap_chicago = maps.heat_map(socio_life_merged_DF, "Life expectancy 2010",cst.AREA_NUM ,'life_exp_2010', \
                                good_indicator = True, rename_mapping=rename_mapping)
heatmap_chicago
Out[34]:
In [35]:
# Save map
heatmap_chicago.save(outfile=cst.MAPS_PATH+"life_exp_2010.html")

We could compare the map above, displaying the life expectancies, with the percentage of housholds below the poverty line.

In [36]:
rename_mapping = {'housholds_below_poverty_perc':'housholds below poverty line (perc)'}
heatmap_chicago = maps.heat_map(socio_life_merged_DF, "housholds below poverty line as percentage", cst.AREA_NUM, \
                                'housholds_below_poverty_perc', rename_mapping=rename_mapping)
heatmap_chicago
Out[36]:
In [37]:
# Save map
heatmap_chicago.save(outfile=cst.MAPS_PATH+"housholds_below_poverty_perc.html")

In the first map, some community areas have no entry for their life expectancies. Clearly the life expectancies are higher near the center of Chicago, and on it's north side. These are also the regions with lowest amount of housholds below poverty line. Around West Garfield Park, the life expectancy drops drastically, as do the housholds above poverty line.

Percentage of failing Restaurants and Stores

In [38]:
#Display Top20 inspected facility types
food_inspections_DF["facility_type"].value_counts()[:20]
Out[38]:
Restaurant                         128034
Grocery Store                       24743
School                              12107
Children's Services Facility         3077
Bakery                               2874
Daycare (2 - 6 Years)                2689
Daycare Above and Under 2 Years      2367
Long Term Care                       1351
Catering                             1151
Liquor                                854
Mobile Food Dispenser                 818
Daycare Combo 1586                    752
Mobile Food Preparer                  587
Golden Diner                          569
Hospital                              547
Wholesale                             536
TAVERN                                281
Daycare (Under 2 Years)               250
Special Event                         217
Shared Kitchen User (Long Term)       179
Name: facility_type, dtype: int64
In [39]:
# Create a filtered dataframe that only contains restaurants and stores
food_inspections_DF = food_inspections_DF.dropna()
filtered_inspections_DF = food_inspections_DF[(food_inspections_DF["facility_type"].str.lower().str.contains("store")) | (food_inspections_DF["facility_type"].str.lower().str.contains("restaurant"))]

count_DF = filtered_inspections_DF.copy()
count_DF = count_DF.drop_duplicates(subset=['license_num'])
count_ser = count_DF[cst.AREA_NUM].value_counts().to_frame()
count_ser.reset_index(level=0, inplace=True)
count_ser.columns=[cst.AREA_NUM,'num_establishments']
count_ser[cst.AREA_NUM] = count_ser[cst.AREA_NUM].astype(str)
count_ser.sort_values('num_establishments');

inspection_counts = filtered_inspections_DF[cst.AREA_NUM].value_counts().to_frame()
inspection_counts.reset_index(level=0, inplace=True)
inspection_counts.columns=[cst.AREA_NUM,'num_inspections']
inspection_counts[cst.AREA_NUM] = inspection_counts[cst.AREA_NUM].astype(str)
inspection_counts.sort_values('num_inspections');


def extract_violation_num(x):
    return int(x.split('.')[0])

def result_to_pass(x):
    if "pass" in x.lower():
        return 0
    else:
        return 100
    

failure_DF = filtered_inspections_DF.copy()
failure_DF = failure_DF.dropna(subset=['violations'])
failure_DF = failure_DF[(failure_DF["result"].str.lower().str.contains("pass")) | (failure_DF["result"].str.lower().str.contains("fail"))]
failure_DF['pass_num'] = failure_DF['violations'].apply(lambda x : extract_violation_num(x)).astype(int)
failure_DF['failed inspections (%)'] = failure_DF['result'].apply(lambda x : result_to_pass(x)).astype(int)
failure_ser = failure_DF.groupby(cst.AREA_NUM)['failed inspections (%)'].mean().to_frame()
failure_ser.reset_index(level=0, inplace=True)
failure_ser[cst.AREA_NUM] = failure_ser[cst.AREA_NUM].astype(str)

rounding = 3
failure_ser['failed inspections (%)'] = failure_ser['failed inspections (%)'].apply(lambda x: round(x, rounding))
In [40]:
heatmap_chicago = maps.heat_map(failure_ser, "Percentage of failed inspections for restaurants and stores", cst.AREA_NUM, 'failed inspections (%)')
heatmap_chicago
Out[40]:
In [41]:
# Save map
heatmap_chicago.save(outfile=cst.MAPS_PATH+"map_restaurant_pass.html")

As a final conclusion:

The center and north of Chicago mostly have higher life expectancies and lower percentages of households below the poverty line.

With respect to food inspections, these areas also have more restaurants, and more food inspections per restaurant than the south.

Statistics :

Socioeconomic indicators

We report some statistics on the various dataframes.

Table of Contents

In [42]:
# Check: range, mean with confidence interval.
important_columns = ['community_area_num', 'community_area_name', 'housing_crowded_perc',
       'housholds_below_poverty_perc', 'aged_16_or_more_unemployed_perc',
       'aged_25_or_more_without_high_school_diploma_perc',
       'aged_under_18_or_over_64_perc', 'per_capita_income', 'hardship_idx',
       'life_exp_2010']

socio_life_merged_DF[important_columns].describe()
Out[42]:
housing_crowded_perc housholds_below_poverty_perc aged_16_or_more_unemployed_perc aged_25_or_more_without_high_school_diploma_perc aged_under_18_or_over_64_perc per_capita_income hardship_idx life_exp_2010
count 78.000000 78.000000 78.000000 78.000000 78.000000 78.000000 77.000000 73.000000
mean 4.920513 21.739744 15.341026 20.330769 35.717949 25597.000000 49.506494 77.597260
std 3.658981 11.457231 7.499497 11.746514 7.284421 15196.405541 28.690556 4.130879
min 0.300000 3.300000 4.700000 2.500000 13.500000 8201.000000 1.000000 68.800000
25% 2.325000 13.350000 9.200000 12.075000 32.150000 15804.750000 25.000000 74.400000
50% 3.850000 19.050000 13.850000 18.650000 38.050000 21668.500000 50.000000 79.500000
75% 6.800000 29.150000 20.000000 26.600000 40.500000 28715.750000 74.000000 80.500000
max 15.800000 56.500000 35.900000 54.800000 51.500000 88669.000000 98.000000 85.200000

We can see above the mean, standart deviations and confidence intervals for some columns that highlight the socio economic factors of all areas. Of course in the database all community areas are considered equally, and so this does not take into account the sizes of the areas, or the number of people in it. However it is still interesting to analyse this result, to get some insight on what we are dealing with. For example it is surprising that the standart deviation of the income per capita is more than half of its mean! Or that the life expectancies in all areas have an std of more than 4 years.

In what follows we analyse more closely the correlations between the different features in this dataframe.

In [43]:
corr = socio_life_merged_DF[cst.SOCIOECONOMIC_METRICS].corr()
corr
Out[43]:
housing_crowded_perc housholds_below_poverty_perc aged_16_or_more_unemployed_perc aged_25_or_more_without_high_school_diploma_perc per_capita_income hardship_idx life_exp_2010 aged_under_18_or_over_64_perc
housing_crowded_perc 1.000000 0.319403 0.165299 0.875959 -0.541730 0.649574 -0.044064 0.224692
housholds_below_poverty_perc 0.319403 1.000000 0.800084 0.424294 -0.567025 0.803267 -0.691029 0.435894
aged_16_or_more_unemployed_perc 0.165299 0.800084 1.000000 0.355518 -0.656619 0.792294 -0.797766 0.676532
aged_25_or_more_without_high_school_diploma_perc 0.875959 0.424294 0.355518 1.000000 -0.709770 0.802538 -0.136151 0.408878
per_capita_income -0.541730 -0.567025 -0.656619 -0.709770 1.000000 -0.849167 0.566589 -0.754844
hardship_idx 0.649574 0.803267 0.792294 0.802538 -0.849167 1.000000 -0.616442 0.690844
life_exp_2010 -0.044064 -0.691029 -0.797766 -0.136151 0.566589 -0.616442 1.000000 -0.566358
aged_under_18_or_over_64_perc 0.224692 0.435894 0.676532 0.408878 -0.754844 0.690844 -0.566358 1.000000

We now want to make sure that the correlation between all two pairs of metrics both good or both bad is always negative, and that the correlation between 2 pairs of variables of different type is always negative. For this, we create the boolean variable sign_kept: only if the above condition is always met, then sign_kept will be true.

In [44]:
bad_metrics = set(['housing_crowded_perc', 'housholds_below_poverty_perc', 'aged_16_or_more_unemployed_perc', 
               'aged_25_or_more_without_high_school_diploma_perc', 'hardship_idx', 'aged_under_18_or_over_64_perc'])
good_metrics = set(['per_capita_income', 'life_exp_2010' ])
sign_kept = True

for c1 in cst.SOCIOECONOMIC_METRICS:
    for c2 in cst.SOCIOECONOMIC_METRICS:
        if (c1 in bad_metrics and c2 in bad_metrics) or (c1 in good_metrics and c2 in good_metrics):
            if corr[c1][c2] < 0:
                sign_kept = False
        elif (c1 in bad_metrics and c2 in good_metrics) or (c1 in good_metrics and c2 in bad_metrics):
            if corr[c1][c2] > 0:
                sign_kept = False
                
if sign_kept: 
    print('The correlation between indicators both good or both bad is always positive,\n and the correlation between a good and a bad indicator is always negative')
The correlation between indicators both good or both bad is always positive,
 and the correlation between a good and a bad indicator is always negative
In [45]:
# Set correlation between each variable and itself to None in order to ignore it later
for c in corr.columns:
    corr[c][c] = None 
    
corrmax =pd.DataFrame(corr.idxmax()).rename({0: 'Strongest positive correlation'}, axis = 1)
corrmax['Correlation value'] = corr.max()
corrmax
Out[45]:
Strongest positive correlation Correlation value
housing_crowded_perc aged_25_or_more_without_high_school_diploma_perc 0.875959
housholds_below_poverty_perc hardship_idx 0.803267
aged_16_or_more_unemployed_perc housholds_below_poverty_perc 0.800084
aged_25_or_more_without_high_school_diploma_perc housing_crowded_perc 0.875959
per_capita_income life_exp_2010 0.566589
hardship_idx housholds_below_poverty_perc 0.803267
life_exp_2010 per_capita_income 0.566589
aged_under_18_or_over_64_perc hardship_idx 0.690844
In [46]:
corrmin =pd.DataFrame(corr.idxmin()).rename({0: 'Strongest negative correlation'}, axis = 1)
corrmin['Correlation value'] = corr.min()
corrmin
Out[46]:
Strongest negative correlation Correlation value
housing_crowded_perc per_capita_income -0.541730
housholds_below_poverty_perc life_exp_2010 -0.691029
aged_16_or_more_unemployed_perc life_exp_2010 -0.797766
aged_25_or_more_without_high_school_diploma_perc per_capita_income -0.709770
per_capita_income hardship_idx -0.849167
hardship_idx per_capita_income -0.849167
life_exp_2010 aged_16_or_more_unemployed_perc -0.797766
aged_under_18_or_over_64_perc per_capita_income -0.754844

Of the above correlations, we notice certain things: Firstly, we can classify the indicators between good (life expectancy and per capita income) and bad (percentage of crowded houses, percentage of below porverty households, percentage of over 16 unemployed people, percentage fo over 25 people without a high school diploma, the hardship index, and the percentage of people under 18 and over 64), and the correlations between indicators either both good or both bad will always be positive, whereas the correlation between a good and a bad indicator will always be negative (sign_kept).

We also notice that the percentage of people under 18 or over 64 is a strong negative indicator: it is more negatively correlated to per capita income than, for example, the percentage of houses living below the poverty line.

It is indeed quite surprising that per capita average income is not more correlated to the percentage of houses living below the poverty line (correlation is -0.56). We plot the 2 metrics in order to see this:

One reason the linear correlation is so low is that the relationship is exponential. However, we also notice that thetop 5 highest per capita neighbourhoods are not in the top 15 lowest poor households percentage.

We now plot a graph showing the per capita income on the y-axis and the housholds below poverty on the x-axis, of every community area. The plot is interactive, you can hover over a point to see its community area name and the precise values reported and click on it to see how other areas of the same region perform.

In [47]:
plotly.offline.init_notebook_mode(connected=True)
In [48]:
from utils import interactive_plot_handler
from utils.interactive_plot_handler import region_to_color, name_to_idx, name_to_region
In [49]:
# Add a 'region' column based on # https://en.wikipedia.org/wiki/Chicago#/media/File:Chicago_community_areas_map.svg
areas_DF['region'] = areas_DF['community_area_name'].apply(lambda name: name_to_region[name])
In [50]:
# Display the Chicago regions
# This map contains additional layers to visually check if the found locations are actually within the borders of the city
map_chicago = maps.create_chicago_map(with_community_areas=True, with_regions=True, areas_DF=areas_DF)
map_chicago
Out[50]:
In [51]:
# Save map
map_chicago.save(outfile=cst.MAPS_PATH+"map_chicago_regions_colored.html")
In [52]:
# Add a 'region' column based on # https://en.wikipedia.org/wiki/Chicago#/media/File:Chicago_community_areas_map.svg
socio_life_merged_DF['region'] = socio_life_merged_DF['community_area_name'].apply(lambda name: name_to_region[name])
In [53]:
fig = interactive_plot_handler.interactive_plot_capita_income_wrt_households(socio_life_merged_DF)
fig
In [54]:
# Saves the figure depending on which point you click -> replace interactivity by showing right figure on page.
#region = 'all'
#fig_path = cst.FIG_PATH + 'comm_area_per_income_below_poverty_'+ region + '.html'
#fig.write_html(fig_path)

Predictions of Inspection: Will a facility get inspected?

In this section we attempt to predict if an establishment will be inspected in 2019. We therefore define the problem as a binary classification, and, in order to understand the accuracy, we define the period as starting from April 2018.

What we want to see is not so much whether or not we can predict this, but the effect that each factor has. We use Logistic Regression. For the categorical variables, we use one hot encoding. In order to compare the weights and see the importance each feature has in the decision, we use the MinMax scaler.

Table of Contents

In [55]:
from datetime import datetime
from tqdm import tqdm_notebook as tqdm

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import accuracy_score

from utils import predictions_helpers
from utils.predictions_helpers import risk_to_num, inspected_after_2018, n_inspections_before_2018, \
                                      results_to_number, int_of_x
In [56]:
food_inspections_DF['risk'] = food_inspections_DF['risk'].apply(risk_to_num)

d = {
     'facility_type': lambda l: list(l)[0],
     'risk': list,
     'inspection_date': list,
     'inspection_type': list,
     'result': list,
     'community_area_num': lambda l: list(l)[0]
}

#build a dataframe in which each row is a facility
establishment_centric = food_inspections_DF.groupby('license_num').agg(d).loc[1:]

#Drop establishments only inspected once
establishment_centric_many =  establishment_centric[establishment_centric.risk.apply(len) > 2].reset_index()
In [57]:
#See all possible result to inspections
all_options = set()
for e in establishment_centric_many.result:
    for i in e:
        all_options.add(i)
print(all_options)
{'Not Ready', 'Pass w/ Conditions', 'Fail', 'Out of Business', 'No Entry', 'Pass'}
In [58]:
#based on the performance between 2010 - 2017, predict if there is an inspection in 2019
establishment_centric_many['inspected_after_2018'] = establishment_centric_many.inspection_date.apply(inspected_after_2018)
establishment_centric_many['average_risk'] = establishment_centric_many.risk.apply(np.mean)
establishment_centric_many['n_inspections'] = establishment_centric_many.inspection_date.apply(n_inspections_before_2018)
establishment_centric_many['average_result'] = establishment_centric_many.result.apply(results_to_number)
In [59]:
establishment_centric_many = establishment_centric_many[~ establishment_centric_many.average_risk.isna()]
    

socio_life_merged_DF['community_area_num'] = socio_life_merged_DF.community_area_num.apply(int_of_x)
establishment_centric_many_social = establishment_centric_many.merge(socio_life_merged_DF.dropna(), on = 'community_area_num').reset_index()

#one hot encode facility type
establishment_centric_many_merged = establishment_centric_many_social.reset_index().merge(pd.get_dummies(establishment_centric_many.facility_type).reset_index()\
                                                                                   , on = 'index')

establishment_centric_many_merged['per_capita_income_log'] = establishment_centric_many_merged['per_capita_income'].apply(np.log)

#define the socio economic metrics 
bad_metrics = list(['housing_crowded_perc', 'housholds_below_poverty_perc', 'aged_16_or_more_unemployed_perc', 
               'aged_25_or_more_without_high_school_diploma_perc', 'hardship_idx', 'aged_under_18_or_over_64_perc'])

good_metrics = list(['per_capita_income', 'life_exp_2010' ])
In [60]:
#Let's look at the risk for some facility types:
establishment_centric_many_merged.groupby('facility_type').agg({'risk': lambda x: np.mean([e[0] for e in x])}).sort_values('risk').head(20)
Out[60]:
risk
facility_type
GROCERY/DRUG STORE 1.0
GROCERY STORE / GAS STATION 1.0
THEATRE 1.0
Rest/GYM 1.0
GAS STATION/STORE 1.0
RETAIL FOOD/GAS STATION 1.0
CATERING/CAFE 1.0
CONVENIENCE 1.0
CANDY STORE 1.0
Mobile Frozen Desserts Vendor 1.0
WINE STORE 1.0
GROCERY/GAS STATION 1.0
bar 1.0
COLD/FROZEN FOOD STORAGE 1.0
BAR 1.0
GAS STATION/STORE GROCERY 1.0
convenience/drug store 1.0
Laundromat 1.0
convenience 1.0
grocery/dollar store 1.0
In [61]:
establishment_centric_many_merged.groupby('facility_type').agg({'risk': lambda x: np.mean([-e[0] for e in x])}).sort_values('risk').head(20)
Out[61]:
risk
facility_type
1023 -3.0
NURSING HOME -3.0
MOBILE FOOD -3.0
MAIN KITCHEN -3.0
Long-Term Care Facility -3.0
Long Term Care Facility -3.0
Long Term Care -3.0
LOUNGE/BANQUET HALL -3.0
Hospital -3.0
HOTEL -3.0
school cafeteria -3.0
Grocery(Sushi prep) -3.0
Grocery & Restaurant -3.0
Golden Diner -3.0
GROCERY/TAQUERIA -3.0
GROCERY/RESTAURANT -3.0
GROCERY/BAKERY -3.0
GROCERY STORE/TAQUERIA -3.0
GROCERY STORE/COOKING SCHOOL -3.0
GROCERY STORE/BAKERY -3.0
In [62]:
corr_withfood = establishment_centric_many_merged[ good_metrics + bad_metrics + ['average_risk', 'n_inspections', 'average_result']].corr()
corr_withfood
Out[62]:
per_capita_income life_exp_2010 housing_crowded_perc housholds_below_poverty_perc aged_16_or_more_unemployed_perc aged_25_or_more_without_high_school_diploma_perc hardship_idx aged_under_18_or_over_64_perc average_risk n_inspections average_result
per_capita_income 1.000000 0.681754 -0.629377 -0.567091 -0.679461 -0.776762 -0.846751 -0.790438 0.128811 0.038932 0.084931
life_exp_2010 0.681754 1.000000 -0.213135 -0.673889 -0.813926 -0.328400 -0.638769 -0.656274 0.190872 0.060243 0.132003
housing_crowded_perc -0.629377 -0.213135 1.000000 0.473572 0.371424 0.905316 0.765819 0.469260 -0.048614 -0.019507 -0.063819
housholds_below_poverty_perc -0.567091 -0.673889 0.473572 1.000000 0.776675 0.533233 0.777123 0.479571 -0.151099 -0.038565 -0.158198
aged_16_or_more_unemployed_perc -0.679461 -0.813926 0.371424 0.776675 1.000000 0.524965 0.826482 0.752770 -0.188769 -0.043049 -0.127207
aged_25_or_more_without_high_school_diploma_perc -0.776762 -0.328400 0.905316 0.533233 0.524965 1.000000 0.883654 0.629489 -0.068116 -0.023758 -0.065610
hardship_idx -0.846751 -0.638769 0.765819 0.777123 0.826482 0.883654 1.000000 0.812700 -0.147231 -0.040414 -0.112617
aged_under_18_or_over_64_perc -0.790438 -0.656274 0.469260 0.479571 0.752770 0.629489 0.812700 1.000000 -0.143731 -0.038847 -0.082917
average_risk 0.128811 0.190872 -0.048614 -0.151099 -0.188769 -0.068116 -0.147231 -0.143731 1.000000 0.194711 0.116296
n_inspections 0.038932 0.060243 -0.019507 -0.038565 -0.043049 -0.023758 -0.040414 -0.038847 0.194711 1.000000 0.080273
average_result 0.084931 0.132003 -0.063819 -0.158198 -0.127207 -0.065610 -0.112617 -0.082917 0.116296 0.080273 1.000000
In [63]:
for c in corr_withfood.columns:
    corr_withfood[c][c] = None #Avoid bright diagonal
    
cols = ['per18-64', 'hardship',\
                     'perNoHighschool', 'Unemployment', 'Per poverty', 'Per crowded', 'Life exp', 'Income'][::-1]
    
fig = go.Figure(data=go.Heatmap(
                   z=corr_withfood[['average_result', 'n_inspections', 'average_risk']],
                   x=['average_result', 'n_inspections', 'average_risk'],
                   y=cols,
                   hoverongaps = False,
                   colorscale=px.colors.diverging.Tealrose))
fig.show()
In [64]:
plotly.offline.plot(fig, filename='../docs/img/corr_socio_good.html', auto_open=False);
In [65]:
features = list(establishment_centric_many.facility_type.unique())

for feature in features:
    if(sum(establishment_centric_many_merged[feature])<30):
        print('Removing feature... ' + feature)
        features.remove(feature)
        
features += ['average_risk','average_result', 'n_inspections']
Removing feature... RESTAURANT/GROCERY STORE
Removing feature... LOUNGE/BANQUET HALL
Removing feature... GROCERY/RESTAURANT
Removing feature... Wholesale
Removing feature... butcher shop
Removing feature... fish market
Removing feature... PRIVATE SCHOOL
Removing feature... BOWLING LANES/BANQUETS
Removing feature... CULINARY SCHOOL
Removing feature... Grocery & Restaurant
Removing feature... BANQUET HALL
Removing feature... NURSING HOME
Removing feature... Coffee shop
Removing feature... BANQUET FACILITY
Removing feature... banquets
Removing feature... JUICE BAR
Removing feature... GAS STATION/MINI MART
Removing feature... RESTAURANT.BANQUET HALLS
Removing feature... RESTAURANT/GAS STATION
Removing feature... HOT DOG STATION
Removing feature... PASTRY school
Removing feature... grocery & restaurant
Removing feature... GAS STATION/RESTAURANT
Removing feature... LIVE POULTRY
Removing feature... GROCERY STORE/BAKERY
Removing feature... ROOFTOPS
Removing feature... ASSISTED LIVING
Removing feature... ROOF TOPS
Removing feature... watermelon house
Removing feature... GROCERY/DRUG STORE
Removing feature... SUPPORTIVE LIVING FACILITY
Removing feature... COFFEE KIOSK
Removing feature... GROCERY STORE/COOKING SCHOOL
Removing feature... meat packing
Removing feature... FITNESS CENTER
Removing feature... Grocery(Sushi prep)
Removing feature... Private School
Removing feature... CHURCH KITCHEN
Removing feature... Daycare (2 Years)
Removing feature... Assisted Living Senior Care
Removing feature... banquets/room service
Removing feature... ICE CREAM SHOP
Removing feature... CHARITY AID KITCHEN
Removing feature... SLAUGHTER HOUSE/ GROCERY
Removing feature... A-Not-For-Profit Chef Training Program
Removing feature... CITY OF CHICAGO COLLEGE
Removing feature... Ice cream
Removing feature... BAR
Removing feature... SENIOR DAY CARE
Removing feature... COFFEE SHOP
Removing feature... Banquet Hall
Removing feature... CUSTOM POULTRY SLAUGHTER
Removing feature... convenience/drug store
Removing feature... MAIN KITCHEN
Removing feature... GAS STATION/CONVENIENCE STORE
Removing feature... BAKERY/DELI
Removing feature... Live Poultry
Removing feature... RESTAURANT/BAKERY
Removing feature... grocery/dollar store
Removing feature... Long-Term Care Facility
Removing feature... GROCERY/BAKERY
Removing feature... JUICE AND SALAD BAR
Removing feature... CANDY/GELATO
Removing feature... GROCERY AND BUTCHER
Removing feature... GAS STATION/STORE
Removing feature... CATERING/CAFE
Removing feature... CONVENIENCE
Removing feature... RETAIL FOOD/GAS STATION
Removing feature... REHAB CENTER
Removing feature... UNIVERSITY CAFETERIA
Removing feature... SCHOOL
Removing feature... TEA BREWING
Removing feature... Mobile Frozen Desserts Vendor
Removing feature... MOBILE FROZEN DESSERTS VENDOR
Removing feature... CHILDRENS SERVICES FACILITY
Removing feature... GROCERY/TAQUERIA
Removing feature... 15 monts to 5 years old
Removing feature... WINE STORE
Removing feature... Banquet rooms
Removing feature... CHILDERN'S SERVICE FACILITY
Removing feature... REGULATED BUSINESS
Removing feature... CHARTER SCHOOL CAFETERIA
Removing feature... COOKING SCHOOL
Removing feature... BREWPUB
Removing feature... COMMISSARY
Removing feature... DAYCARE 2 YRS TO 12 YRS
Removing feature... Poultry Slaughter
Removing feature... Theater & Restaurant
Removing feature... DINING HALL
Removing feature... CANDY SHOP
Removing feature... VENDING COMMISSARY
In [66]:
establishment_centric_many_merged = establishment_centric_many_merged.dropna(subset=features)
In [67]:
#Make classes completely balanced in the validation dataset by dropping a few inspections
val_set = establishment_centric_many_merged.sort_values('inspected_after_2018')
val_set = val_set.iloc[:-943]
In [68]:
#Train a Logistic regression model
target = 'inspected_after_2018'

val_set = val_set.dropna(subset=features + [target])
#LetÅ› fo min max scaler
#To estimate the coefficients and the accuracy we use Monte Carlo crossvalidation
coefs = []
accs = []

for _ in tqdm(range(100)):
    train, test = train_test_split(val_set)
    sc = MinMaxScaler()
    X = sc.fit_transform(train[features])
    reg = LogisticRegression(solver='liblinear') #optimize alpha letter
    reg.fit(X, train[target])
    coefs.append(reg.coef_[0])
    X_test = sc.transform(test[features])
    accs.append(accuracy_score(reg.predict(X_test), test[target]))

print('The mean accuracy estimated using Montecarlo cross validation is %1.3f +- %1.3f, compared to a random accuracy of 0.5. '%(np.mean(accs), 1.64*np.std(accs)/np.sqrt(len(accs))))
The mean accuracy estimated using Montecarlo cross validation is 0.607 +- 0.002, compared to a random accuracy of 0.5. 
In [69]:
#Let's find the mean of the coefficients 
coef_mean = np.mean(coefs, axis=0)
coef_std = np.std(coefs, axis=0)

coeffs = []
margins = []
names = []
for c, std, f in sorted(zip(coef_mean, coef_std, features), key = lambda x: -x[0]):
    names.append(f.upper())
    coeffs.append(c)
    margins.append(1.64*std/np.sqrt(len(coefs)))
    print('%s   -Coefficient: %2.3f. -Confidence interval 90 per cent: %f to %f.(%f)'%(f.upper(), c, c-1.64*std/np.sqrt(len(coefs)), c+1.64*std/np.sqrt(len(coefs)),1.64*std/np.sqrt(len(coefs)) ))
    
N_INSPECTIONS   -Coefficient: 6.149. -Confidence interval 90 per cent: 5.982415 to 6.315322.(0.166453)
AVERAGE_RISK   -Coefficient: 1.775. -Confidence interval 90 per cent: 1.768451 to 1.781577.(0.006563)
MOBILE DESSERTS VENDOR   -Coefficient: 0.768. -Confidence interval 90 per cent: 0.729817 to 0.805728.(0.037956)
BANQUET   -Coefficient: 0.728. -Confidence interval 90 per cent: 0.708717 to 0.746372.(0.018828)
STADIUM   -Coefficient: 0.578. -Confidence interval 90 per cent: 0.552805 to 0.603394.(0.025295)
SHELTER   -Coefficient: 0.496. -Confidence interval 90 per cent: 0.450208 to 0.541538.(0.045665)
RIVERWALK   -Coefficient: 0.467. -Confidence interval 90 per cent: 0.426635 to 0.508132.(0.040748)
CAFETERIA   -Coefficient: 0.416. -Confidence interval 90 per cent: 0.375187 to 0.456064.(0.040438)
HOSPITAL   -Coefficient: 0.409. -Confidence interval 90 per cent: 0.379674 to 0.439253.(0.029789)
LONG TERM CARE   -Coefficient: 0.387. -Confidence interval 90 per cent: 0.371438 to 0.403490.(0.016026)
GROCERY   -Coefficient: 0.375. -Confidence interval 90 per cent: 0.340381 to 0.409599.(0.034609)
LAUNDROMAT   -Coefficient: 0.360. -Confidence interval 90 per cent: 0.317846 to 0.402728.(0.042441)
GROCERY STORE/GAS STATION   -Coefficient: 0.338. -Confidence interval 90 per cent: 0.320056 to 0.356905.(0.018425)
LONG TERM CARE FACILITY   -Coefficient: 0.328. -Confidence interval 90 per cent: 0.307690 to 0.348796.(0.020553)
SMOKEHOUSE   -Coefficient: 0.322. -Confidence interval 90 per cent: 0.295910 to 0.348915.(0.026502)
RETAIL STORE OFFERS COOKING CLASSES   -Coefficient: 0.309. -Confidence interval 90 per cent: 0.280997 to 0.336435.(0.027719)
1023 CHILDREN'S SERVICES FACILITY   -Coefficient: 0.301. -Confidence interval 90 per cent: 0.270869 to 0.331003.(0.030067)
RESTAURANT/BAR   -Coefficient: 0.300. -Confidence interval 90 per cent: 0.260746 to 0.339149.(0.039202)
GROCERY& RESTAURANT   -Coefficient: 0.296. -Confidence interval 90 per cent: 0.268528 to 0.323187.(0.027330)
BANQUET ROOM   -Coefficient: 0.288. -Confidence interval 90 per cent: 0.263209 to 0.313569.(0.025180)
1023-CHILDREN'S SERVICES FACILITY   -Coefficient: 0.285. -Confidence interval 90 per cent: 0.235803 to 0.334949.(0.049573)
MOBILE FOOD TRUCK   -Coefficient: 0.284. -Confidence interval 90 per cent: 0.253211 to 0.314242.(0.030516)
1023   -Coefficient: 0.283. -Confidence interval 90 per cent: 0.252348 to 0.314232.(0.030942)
BAR   -Coefficient: 0.274. -Confidence interval 90 per cent: 0.247776 to 0.299718.(0.025971)
AFTER SCHOOL PROGRAM   -Coefficient: 0.267. -Confidence interval 90 per cent: 0.244070 to 0.290756.(0.023343)
HOTEL   -Coefficient: 0.261. -Confidence interval 90 per cent: 0.235006 to 0.287178.(0.026086)
CAFETERIA   -Coefficient: 0.260. -Confidence interval 90 per cent: 0.234615 to 0.285239.(0.025312)
GAS STATION/STORE GROCERY   -Coefficient: 0.258. -Confidence interval 90 per cent: 0.232678 to 0.282904.(0.025113)
MOBILE FOOD PREPARER   -Coefficient: 0.254. -Confidence interval 90 per cent: 0.227269 to 0.280450.(0.026590)
RESTAURANT/HOSPITAL   -Coefficient: 0.252. -Confidence interval 90 per cent: 0.227033 to 0.276021.(0.024494)
DAYCARE (2 - 6 YEARS)   -Coefficient: 0.247. -Confidence interval 90 per cent: 0.230969 to 0.263116.(0.016074)
BEFORE AND AFTER SCHOOL PROGRAM   -Coefficient: 0.233. -Confidence interval 90 per cent: 0.211883 to 0.253722.(0.020919)
COLD/FROZEN FOOD STORAGE   -Coefficient: 0.231. -Confidence interval 90 per cent: 0.212597 to 0.249449.(0.018426)
RESTAURANT(PROTEIN SHAKE BAR)   -Coefficient: 0.229. -Confidence interval 90 per cent: 0.203469 to 0.253842.(0.025186)
GROCERY STORE / GAS STATION   -Coefficient: 0.228. -Confidence interval 90 per cent: 0.205343 to 0.250946.(0.022802)
DAYCARE (UNDER 2 YEARS)   -Coefficient: 0.225. -Confidence interval 90 per cent: 0.196980 to 0.253555.(0.028287)
REST/ROOFTOP   -Coefficient: 0.223. -Confidence interval 90 per cent: 0.201186 to 0.244644.(0.021729)
MOVIE THEATRE   -Coefficient: 0.217. -Confidence interval 90 per cent: 0.198931 to 0.235822.(0.018446)
REST/GROCERY   -Coefficient: 0.215. -Confidence interval 90 per cent: 0.188693 to 0.241708.(0.026508)
GROCERY/TAVERN   -Coefficient: 0.209. -Confidence interval 90 per cent: 0.186944 to 0.232016.(0.022536)
RESTUARANT AND BAR   -Coefficient: 0.206. -Confidence interval 90 per cent: 0.187008 to 0.225121.(0.019057)
MOBILE FOOD DISPENSER   -Coefficient: 0.184. -Confidence interval 90 per cent: 0.145957 to 0.221087.(0.037565)
BAKERY   -Coefficient: 0.178. -Confidence interval 90 per cent: 0.163107 to 0.193706.(0.015300)
TAVERN   -Coefficient: 0.160. -Confidence interval 90 per cent: 0.117195 to 0.202378.(0.042592)
1023 CHILDERN'S SERVICES FACILITY   -Coefficient: 0.102. -Confidence interval 90 per cent: 0.058288 to 0.145455.(0.043584)
DAYCARE ABOVE AND UNDER 2 YEARS   -Coefficient: 0.075. -Confidence interval 90 per cent: 0.060051 to 0.089202.(0.014575)
CHILDREN'S SERVICES FACILITY   -Coefficient: 0.068. -Confidence interval 90 per cent: 0.055537 to 0.080815.(0.012639)
DAYCARE NIGHT   -Coefficient: 0.066. -Confidence interval 90 per cent: 0.025495 to 0.106352.(0.040428)
CATERING   -Coefficient: 0.036. -Confidence interval 90 per cent: 0.019079 to 0.053457.(0.017189)
CONVENIENCE   -Coefficient: 0.000. -Confidence interval 90 per cent: 0.000000 to 0.000000.(0.000000)
COFFEE  SHOP   -Coefficient: 0.000. -Confidence interval 90 per cent: 0.000000 to 0.000000.(0.000000)
COLLEGE   -Coefficient: 0.000. -Confidence interval 90 per cent: 0.000000 to 0.000000.(0.000000)
WRIGLEY ROOFTOP   -Coefficient: 0.000. -Confidence interval 90 per cent: 0.000000 to 0.000000.(0.000000)
HEALTH/ JUICE BAR   -Coefficient: 0.000. -Confidence interval 90 per cent: 0.000000 to 0.000000.(0.000000)
BANQUET HALL/CATERING   -Coefficient: 0.000. -Confidence interval 90 per cent: 0.000000 to 0.000000.(0.000000)
MOBILE FOOD   -Coefficient: 0.000. -Confidence interval 90 per cent: 0.000000 to 0.000000.(0.000000)
DAYCARE   -Coefficient: -0.025. -Confidence interval 90 per cent: -0.063711 to 0.014607.(0.039159)
CONVENIENCE STORE   -Coefficient: -0.055. -Confidence interval 90 per cent: -0.097899 to -0.012416.(0.042742)
SHARED KITCHEN   -Coefficient: -0.056. -Confidence interval 90 per cent: -0.103305 to -0.009616.(0.046844)
GROCERY STORE   -Coefficient: -0.061. -Confidence interval 90 per cent: -0.071177 to -0.050652.(0.010263)
RESTAURANT   -Coefficient: -0.062. -Confidence interval 90 per cent: -0.071743 to -0.052434.(0.009655)
PALETERIA   -Coefficient: -0.079. -Confidence interval 90 per cent: -0.120900 to -0.037448.(0.041726)
THEATRE   -Coefficient: -0.084. -Confidence interval 90 per cent: -0.127361 to -0.040280.(0.043540)
EMPLOYEE KITCHEN   -Coefficient: -0.096. -Confidence interval 90 per cent: -0.104157 to -0.086919.(0.008619)
TEACHING SCHOOL   -Coefficient: -0.119. -Confidence interval 90 per cent: -0.129660 to -0.108770.(0.010445)
SPECIAL EVENT   -Coefficient: -0.149. -Confidence interval 90 per cent: -0.185368 to -0.113489.(0.035940)
SCHOOL   -Coefficient: -0.154. -Confidence interval 90 per cent: -0.164442 to -0.142764.(0.010839)
LIQUOR   -Coefficient: -0.168. -Confidence interval 90 per cent: -0.197101 to -0.138247.(0.029427)
WRIGLEY ROOF TOP   -Coefficient: -0.173. -Confidence interval 90 per cent: -0.210117 to -0.136306.(0.036906)
ROOF TOP   -Coefficient: -0.179. -Confidence interval 90 per cent: -0.197888 to -0.159442.(0.019223)
GOLDEN DINER   -Coefficient: -0.184. -Confidence interval 90 per cent: -0.209740 to -0.157424.(0.026158)
COFFEE SHOP   -Coefficient: -0.185. -Confidence interval 90 per cent: -0.218792 to -0.151810.(0.033491)
CHARTER SCHOOL   -Coefficient: -0.185. -Confidence interval 90 per cent: -0.205394 to -0.165491.(0.019952)
1023 CHILDERN'S SERVICE FACILITY   -Coefficient: -0.191. -Confidence interval 90 per cent: -0.213249 to -0.169159.(0.022045)
BANQUET DINING   -Coefficient: -0.199. -Confidence interval 90 per cent: -0.216519 to -0.181721.(0.017399)
GROCERY/GAS STATION   -Coefficient: -0.202. -Confidence interval 90 per cent: -0.218257 to -0.185064.(0.016596)
SNACK SHOP   -Coefficient: -0.213. -Confidence interval 90 per cent: -0.231991 to -0.193725.(0.019133)
SCHOOL CAFETERIA   -Coefficient: -0.250. -Confidence interval 90 per cent: -0.274700 to -0.225917.(0.024392)
CAFETERIA   -Coefficient: -0.262. -Confidence interval 90 per cent: -0.296131 to -0.227544.(0.034293)
BANQUET HALL   -Coefficient: -0.276. -Confidence interval 90 per cent: -0.304929 to -0.246999.(0.028965)
DAY CARE 2-14   -Coefficient: -0.277. -Confidence interval 90 per cent: -0.306057 to -0.247888.(0.029084)
WAREHOUSE   -Coefficient: -0.304. -Confidence interval 90 per cent: -0.331420 to -0.276773.(0.027323)
DONUT SHOP   -Coefficient: -0.312. -Confidence interval 90 per cent: -0.345213 to -0.278193.(0.033510)
CAFE/STORE   -Coefficient: -0.313. -Confidence interval 90 per cent: -0.345757 to -0.280079.(0.032839)
GAS STATION   -Coefficient: -0.316. -Confidence interval 90 per cent: -0.351116 to -0.280170.(0.035473)
BANQUET/KITCHEN   -Coefficient: -0.317. -Confidence interval 90 per cent: -0.349460 to -0.284534.(0.032463)
PUBLIC SHCOOL   -Coefficient: -0.331. -Confidence interval 90 per cent: -0.359521 to -0.301822.(0.028849)
OTHER   -Coefficient: -0.331. -Confidence interval 90 per cent: -0.362545 to -0.299683.(0.031431)
CULINARY ARTS SCHOOL   -Coefficient: -0.335. -Confidence interval 90 per cent: -0.365921 to -0.304078.(0.030922)
REST/GYM   -Coefficient: -0.343. -Confidence interval 90 per cent: -0.380762 to -0.305276.(0.037743)
GROCERY STORE/ RESTAURANT   -Coefficient: -0.347. -Confidence interval 90 per cent: -0.380506 to -0.313009.(0.033748)
SUPPORTIVE LIVING   -Coefficient: -0.347. -Confidence interval 90 per cent: -0.377472 to -0.316891.(0.030291)
DAYCARE COMBO 1586   -Coefficient: -0.350. -Confidence interval 90 per cent: -0.372376 to -0.327790.(0.022293)
CONVENIENCE STORE   -Coefficient: -0.354. -Confidence interval 90 per cent: -0.398496 to -0.309035.(0.044730)
CANDY STORE   -Coefficient: -0.356. -Confidence interval 90 per cent: -0.380430 to -0.331269.(0.024580)
GROCERY STORE/TAQUERIA   -Coefficient: -0.356. -Confidence interval 90 per cent: -0.394405 to -0.317860.(0.038272)
BAR/GRILL   -Coefficient: -0.370. -Confidence interval 90 per cent: -0.396764 to -0.343643.(0.026561)
HOSTEL   -Coefficient: -0.374. -Confidence interval 90 per cent: -0.406164 to -0.341006.(0.032579)
COOKING SCHOOL   -Coefficient: -0.407. -Confidence interval 90 per cent: -0.438630 to -0.375942.(0.031344)
DELI   -Coefficient: -0.458. -Confidence interval 90 per cent: -0.496154 to -0.420825.(0.037664)
AVERAGE_RESULT   -Coefficient: -0.525. -Confidence interval 90 per cent: -0.533046 to -0.517243.(0.007902)
ROOFTOP   -Coefficient: -0.526. -Confidence interval 90 per cent: -0.553269 to -0.499096.(0.027087)
ASSISTED LIVING   -Coefficient: -0.538. -Confidence interval 90 per cent: -0.567163 to -0.509242.(0.028960)
In [70]:
fig = go.Figure()
fig.add_trace(go.Bar(
    name="Upper",
    x=names[:5], y=coeffs[:5],
    error_y=dict(type='data', array=margins[:5]),
     showlegend=False
))
fig.add_trace(go.Bar(
     name="Lower",
    x=names[-5:], y=coeffs[-5:],
    error_y=dict(type='data', array=margins[-5:]),
     showlegend=False
))
fig.update_layout(
     margin=dict(l=0, r=0, t=0, b=0),
    autosize=True,
    width=530,
    height=450,
)
fig.show()
In [71]:
plotly.offline.plot(fig, filename='../docs/img/bar_coeff.html', auto_open=False);